# Uncomment if packages not installed
## install.packages("psych")
## install.packages("dplyr")
## install.packages("ggpubr")
## install.packages("cowplot")
## install.packages("lavaan")
## install.packages("plotly")
## install.packages("tidySEM")



# Load data
# NOTE(review): setwd() with a machine-specific absolute path makes the script
# non-portable; prefer running R from the project directory. Kept for now.
setwd('D:\\Yaxin\\HKBU BM\\Courses\\Sem 2\\ECON7860 Big Data Analytics for Business (S11)\\Group Project\\HR Analytics\\working')
# BUG FIX: read.csv2() defaults to dec = ",", so the "."-decimal columns
# ("satisfaction_level", "last_evaluation") were imported as character
# (see the str() output below). read.csv() uses sep = "," and dec = "."
# and imports them as numeric directly.
rawData <- read.csv("HR_comma_sep.csv")



# Check missing values: anyNA() is the idiomatic (and short-circuiting)
# replacement for TRUE %in% is.na(...)
anyNA(rawData)
## [1] FALSE
# Have a glance of the data
colnames(rawData)
##  [1] "satisfaction_level"    "last_evaluation"       "number_project"       
##  [4] "average_montly_hours"  "time_spend_company"    "Work_accident"        
##  [7] "left"                  "promotion_last_5years" "sales"                
## [10] "salary"
str(rawData)
## 'data.frame':    14999 obs. of  10 variables:
##  $ satisfaction_level   : chr  "0.38" "0.8" "0.11" "0.72" ...
##  $ last_evaluation      : chr  "0.53" "0.86" "0.88" "0.87" ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ sales                : chr  "sales" "sales" "sales" "sales" ...
##  $ salary               : chr  "low" "medium" "medium" "low" ...
summary(rawData)
##  satisfaction_level last_evaluation    number_project  average_montly_hours
##  Length:14999       Length:14999       Min.   :2.000   Min.   : 96.0       
##  Class :character   Class :character   1st Qu.:3.000   1st Qu.:156.0       
##  Mode  :character   Mode  :character   Median :4.000   Median :200.0       
##                                        Mean   :3.803   Mean   :201.1       
##                                        3rd Qu.:5.000   3rd Qu.:245.0       
##                                        Max.   :7.000   Max.   :310.0       
##  time_spend_company Work_accident         left        promotion_last_5years
##  Min.   : 2.000     Min.   :0.0000   Min.   :0.0000   Min.   :0.00000      
##  1st Qu.: 3.000     1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000      
##  Median : 3.000     Median :0.0000   Median :0.0000   Median :0.00000      
##  Mean   : 3.498     Mean   :0.1446   Mean   :0.2381   Mean   :0.02127      
##  3rd Qu.: 4.000     3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000      
##  Max.   :10.000     Max.   :1.0000   Max.   :1.0000   Max.   :1.00000      
##     sales              salary         
##  Length:14999       Length:14999      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
# Frequency tables for the binary / categorical columns
table(rawData$Work_accident)
## 
##     0     1 
## 12830  2169
table(rawData$promotion_last_5years)
## 
##     0     1 
## 14680   319
table(rawData$sales)
## 
##  accounting          hr          IT  management   marketing product_mng 
##         767         739        1227         630         858         902 
##       RandD       sales     support   technical 
##         787        4140        2229        2720
table(rawData$salary)
## 
##   high    low medium 
##   1237   7316   6446
# Target variable: ~23.8% of employees left
table(rawData$left)
## 
##     0     1 
## 11428  3571
# Coerce the two character-encoded score columns to numeric in one pass
rawData[c("satisfaction_level", "last_evaluation")] <-
  lapply(rawData[c("satisfaction_level", "last_evaluation")], as.numeric)
# Re-inspect: both columns now summarize as numerics in [0, 1]
summary(rawData)
##  satisfaction_level last_evaluation  number_project  average_montly_hours
##  Min.   :0.0900     Min.   :0.3600   Min.   :2.000   Min.   : 96.0       
##  1st Qu.:0.4400     1st Qu.:0.5600   1st Qu.:3.000   1st Qu.:156.0       
##  Median :0.6400     Median :0.7200   Median :4.000   Median :200.0       
##  Mean   :0.6128     Mean   :0.7161   Mean   :3.803   Mean   :201.1       
##  3rd Qu.:0.8200     3rd Qu.:0.8700   3rd Qu.:5.000   3rd Qu.:245.0       
##  Max.   :1.0000     Max.   :1.0000   Max.   :7.000   Max.   :310.0       
##  time_spend_company Work_accident         left        promotion_last_5years
##  Min.   : 2.000     Min.   :0.0000   Min.   :0.0000   Min.   :0.00000      
##  1st Qu.: 3.000     1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000      
##  Median : 3.000     Median :0.0000   Median :0.0000   Median :0.00000      
##  Mean   : 3.498     Mean   :0.1446   Mean   :0.2381   Mean   :0.02127      
##  3rd Qu.: 4.000     3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000      
##  Max.   :10.000     Max.   :1.0000   Max.   :1.0000   Max.   :1.00000      
##     sales              salary         
##  Length:14999       Length:14999      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
# Separate numeric variables (plus target) and the target variable.
# Select by name instead of positional indices c(1, 2, 3, 4, 5, 7) so the
# code survives any column reordering in the raw file.
X <- rawData[ , c("satisfaction_level", "last_evaluation", "number_project",
                  "average_montly_hours", "time_spend_company", "left")]
tag <- colnames(X)  # feature labels, reused for plot annotations below
y <- rawData$left   # 0/1 attrition target



# Check for outliers (of numeric variables) with boxplots
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.0.4
## Loading required package: ggplot2
library("cowplot")
## Warning: package 'cowplot' was built under R version 4.0.4
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggpubr':
## 
##     get_legend
# One boxplot per numeric feature (column 6 of X is the target, not plotted)
bxp1 <- ggboxplot(X[, 1], xlab = tag[1], ylab = tag[1], fill = '#999999')
bxp2 <- ggboxplot(X[, 2], xlab = tag[2], ylab = tag[2], fill = '#999999')
bxp3 <- ggboxplot(X[, 3], xlab = tag[3], ylab = tag[3], fill = '#999999')
bxp4 <- ggboxplot(X[, 4], xlab = tag[4], ylab = tag[4], fill = '#999999')
bxp5 <- ggboxplot(X[, 5], xlab = tag[5], ylab = tag[5], fill = '#999999')

# BUG FIX: `tag` has 6 entries (incl. "left") but only 5 plots are drawn on
# the 3x2 grid, so the stray 6th label was stamped on the empty panel.
# Pass only the first 5 labels.
plot_grid(bxp1, bxp2, bxp3, bxp4, bxp5 + rremove("x.text"), 
          labels = tag[1:5], ncol = 3, nrow = 2, label_size = 8)

# Identify outliers for "time_spend_company" (Tukey's rule via boxplot.stats)
out <- boxplot.stats(X[["time_spend_company"]])$out
table(out)
## out
##   6   7   8  10 
## 718 188 162 214
length(out)
## [1] 1282
# Row indices of the subjects carrying these "extreme" tenure values
out_ind <- which(X[["time_spend_company"]] %in% out)



# Examine if the outlier dataset is a different population
# X1 = tenure outliers, X2 = the remaining observations
X1 <- X[out_ind, ]
X2 <- X[-out_ind, ]
summary(X1)
##  satisfaction_level last_evaluation number_project  average_montly_hours
##  Min.   :0.1200     Min.   :0.360   Min.   :2.000   Min.   : 97.0       
##  1st Qu.:0.4700     1st Qu.:0.600   1st Qu.:3.000   1st Qu.:164.0       
##  Median :0.6650     Median :0.760   Median :4.000   Median :215.0       
##  Mean   :0.6247     Mean   :0.735   Mean   :4.016   Mean   :205.9       
##  3rd Qu.:0.8300     3rd Qu.:0.890   3rd Qu.:5.000   3rd Qu.:248.0       
##  Max.   :1.0000     Max.   :1.000   Max.   :6.000   Max.   :293.0       
##  time_spend_company      left      
##  Min.   : 6.000     Min.   :0.000  
##  1st Qu.: 6.000     1st Qu.:0.000  
##  Median : 6.000     Median :0.000  
##  Mean   : 7.067     Mean   :0.163  
##  3rd Qu.: 8.000     3rd Qu.:0.000  
##  Max.   :10.000     Max.   :1.000
summary(X2)
##  satisfaction_level last_evaluation  number_project  average_montly_hours
##  Min.   :0.0900     Min.   :0.3600   Min.   :2.000   Min.   : 96.0       
##  1st Qu.:0.4400     1st Qu.:0.5600   1st Qu.:3.000   1st Qu.:156.0       
##  Median :0.6400     Median :0.7200   Median :4.000   Median :198.0       
##  Mean   :0.6117     Mean   :0.7143   Mean   :3.783   Mean   :200.6       
##  3rd Qu.:0.8100     3rd Qu.:0.8700   3rd Qu.:5.000   3rd Qu.:245.0       
##  Max.   :1.0000     Max.   :1.0000   Max.   :7.000   Max.   :310.0       
##  time_spend_company      left       
##  Min.   :2.000      Min.   :0.0000  
##  1st Qu.:3.000      1st Qu.:0.0000  
##  Median :3.000      Median :0.0000  
##  Mean   :3.165      Mean   :0.2451  
##  3rd Qu.:4.000      3rd Qu.:0.0000  
##  Max.   :5.000      Max.   :1.0000
## Test if mean(left) are significantly lower for X1 than for X2
## (one-sided Welch two-sample t-test)
alpha <- 0.05
t.test(x = X1$left, y = X2$left, alternative = "less")
## 
##  Welch Two Sample t-test
## 
## data:  X1$left and X2$left
## t = -7.4918, df = 1623.6, p-value = 5.551e-14
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##         -Inf -0.06404151
## sample estimates:
## mean of x mean of y 
## 0.1630265 0.2450973
## Test if distributions of features are different between X1 and X2.
## DEDUPLICATION: the six copy-pasted if/else branches (one per column)
## are replaced by a single loop over the column names; the printed
## messages and the resulting `ind` matrix are identical to before.
## NOTE(review): the threshold is alpha / 2 = 0.025 while the message says
## "0.05 level of significance" -- confirm whether the halving is an
## intentional multiple-testing correction or a leftover.
ind <- c()
for (feat in colnames(X1)) {
  p_val <- ks.test(X1[[feat]], X2[[feat]])$p.value
  ind <- rbind(ind, p_val < alpha / 2)
  print(sprintf(
    "The distribution of '%s' is %sstatistically different between X1 and X2 at 0.05 level of significance.",
    feat, if (p_val < alpha / 2) "" else "NOT "
  ))
}
## Each call warns: p-value will be approximate in the presence of ties
## (the features are discrete or coarsely rounded, so ties are expected).
## All six messages report a statistically different distribution.
ind
##      [,1]
## [1,] TRUE
## [2,] TRUE
## [3,] TRUE
## [4,] TRUE
## [5,] TRUE
## [6,] TRUE
#### One-off calls kept for reference:
#### ks.test(X1$satisfaction_level, X2$satisfaction_level)
#### ks.test(X1$last_evaluation, X2$last_evaluation)
#### ks.test(X1$number_project, X2$number_project)
#### ks.test(X1$average_montly_hours, X2$average_montly_hours)
#### ks.test(X1$time_spend_company, X2$time_spend_company)
#### ks.test(X1$left, X2$left)


## Test if distributions of features are different between X1 and X
### ks.test(X1$satisfaction_level, X$satisfaction_level)
### ks.test(X1$last_evaluation, X$last_evaluation)
### ks.test(X1$number_project, X$number_project)
### ks.test(X1$average_montly_hours, X$average_montly_hours)
### ks.test(X1$time_spend_company, X$time_spend_company)
### ks.test(X1$left, X$left)

### All tests are significant at alpha = 0.05


## Test if distributions of features are different between X2 and X
### ks.test(X2$satisfaction_level, X$satisfaction_level)
### ks.test(X2$last_evaluation, X$last_evaluation)
### ks.test(X2$number_project, X$number_project)
### ks.test(X2$average_montly_hours, X$average_montly_hours)
### ks.test(X2$time_spend_company, X$time_spend_company)
### ks.test(X2$left, X$left)

### All tests are NOT significant at alpha = 0.05 except for "time_spend_company"



# Plot the scatter plot matrices for all variables
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
## Warning: package 'psych' was built under R version 4.0.4
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## Move target variable "left" to the end of the data frame
rawData <- rawData %>% relocate(left, .after = last_col())

# SPLOM with correlations/histograms on all rows; pch = '.' keeps the
# 14999-point panels legible.
pairs.panels(rawData, pch = '.')

# Same matrix restricted to the tenure-outlier rows (default point symbol
# is fine for only 1282 observations).
pairs.panels(rawData[out_ind, ])

# And on the complement with outliers removed.
pairs.panels(rawData[-out_ind, ], pch = '.')

# Full data again, without correlation ellipses.
pairs.panels(rawData, pch = '.', ellipses = FALSE)

## pairs.panels(rawData)



# Exploratory factor (principal component, and cluster) analysis
## Standardize the five numeric predictors (column 6 is the target) and
## re-attach the untouched 0/1 target as the final column.
X_norm <- cbind(scale(X[ , -6]), y)
X1_norm <- cbind(scale(X1[ , -6]), y[out_ind])
X2_norm <- cbind(scale(X2[ , -6]), y[-out_ind])


## On all observations
# Parallel analysis (observed scree vs. resampled data) to choose how many
# factors / components to retain.
fa.parallel(X_norm[ , -c(6)])

## Parallel analysis suggests that the number of factors =  2  and the number of components =  2
# One-factor EFA (minres) on the five standardized numeric features,
# fitted first as a baseline even though parallel analysis suggests 2.
fa1 <- fa(X_norm[ , -c(6)], 1)
fa1
## Factor Analysis using method =  minres
## Call: fa(r = X_norm[, -c(6)], nfactors = 1)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        MR1     h2   u2 com
## satisfaction_level   -0.07 0.0051 0.99   1
## last_evaluation       0.51 0.2626 0.74   1
## number_project        0.70 0.4938 0.51   1
## average_montly_hours  0.61 0.3673 0.63   1
## time_spend_company    0.26 0.0662 0.93   1
## 
##                 MR1
## SS loadings    1.19
## Proportion Var 0.24
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  10  and the objective function was  0.48 with Chi Square of  7153.12
## The degrees of freedom for the model are 5  and the objective function was  0.06 
## 
## The root mean square of the residuals (RMSR) is  0.06 
## The df corrected root mean square of the residuals is  0.09 
## 
## The harmonic number of observations is  14999 with the empirical chi square  1142.95  with prob <  6.7e-245 
## The total number of observations was  14999  with Likelihood Chi Square =  839.32  with prob <  3.6e-179 
## 
## Tucker Lewis Index of factoring reliability =  0.766
## RMSEA index =  0.105  and the 90 % confidence intervals are  0.1 0.112
## BIC =  791.24
## Fit based upon off diagonal values = 0.93
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.82
## Multiple R square of scores with factors          0.67
## Minimum correlation of possible factor scores     0.33
fa.diagram(fa1)

# Two-factor EFA, as suggested by the parallel analysis above.
fa2 <- fa(X_norm[ , -c(6)], 2)
## Loading required namespace: GPArotation
fa2
## Factor Analysis using method =  minres
## Call: fa(r = X_norm[, -c(6)], nfactors = 2)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        MR1   MR2    h2   u2 com
## satisfaction_level   -0.01  0.71 0.502 0.50 1.0
## last_evaluation       0.57  0.20 0.346 0.65 1.2
## number_project        0.69 -0.14 0.504 0.50 1.1
## average_montly_hours  0.60  0.03 0.359 0.64 1.0
## time_spend_company    0.24 -0.12 0.077 0.92 1.4
## 
##                        MR1  MR2
## SS loadings           1.21 0.57
## Proportion Var        0.24 0.11
## Cumulative Var        0.24 0.36
## Proportion Explained  0.68 0.32
## Cumulative Proportion 0.68 1.00
## 
##  With factor correlations of 
##       MR1   MR2
## MR1  1.00 -0.08
## MR2 -0.08  1.00
## 
## Mean item complexity =  1.2
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  10  and the objective function was  0.48 with Chi Square of  7153.12
## The degrees of freedom for the model are 1  and the objective function was  0 
## 
## The root mean square of the residuals (RMSR) is  0.01 
## The df corrected root mean square of the residuals is  0.03 
## 
## The harmonic number of observations is  14999 with the empirical chi square  22.37  with prob <  2.3e-06 
## The total number of observations was  14999  with Likelihood Chi Square =  21.01  with prob <  4.6e-06 
## 
## Tucker Lewis Index of factoring reliability =  0.972
## RMSEA index =  0.037  and the 90 % confidence intervals are  0.024 0.051
## BIC =  11.39
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    MR1  MR2
## Correlation of (regression) scores with factors   0.82 0.73
## Multiple R square of scores with factors          0.67 0.53
## Minimum correlation of possible factor scores     0.35 0.06
fa.diagram(fa2)

# One- and two-component PCA for comparison with the factor solutions.
pca1 <- pca(X_norm[ , -c(6)], 1)
pca1
## Principal Components Analysis
## Call: principal(r = r, nfactors = nfactors, residuals = residuals, 
##     rotate = rotate, n.obs = n.obs, covar = covar, scores = scores, 
##     missing = missing, impute = impute, oblique.scores = oblique.scores, 
##     method = method, use = use, cor = cor, correct = 0.5, weight = NULL)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        PC1    h2   u2 com
## satisfaction_level   -0.12 0.014 0.99   1
## last_evaluation       0.69 0.471 0.53   1
## number_project        0.78 0.614 0.39   1
## average_montly_hours  0.74 0.552 0.45   1
## time_spend_company    0.42 0.179 0.82   1
## 
##                 PC1
## SS loadings    1.83
## Proportion Var 0.37
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 component is sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.15 
##  with the empirical chi square  6422.41  with prob <  0 
## 
## Fit based upon off diagonal values = 0.59
fa.diagram(pca1)

pca2 <- pca(X_norm[ , -c(6)], 2)
pca2
## Principal Components Analysis
## Call: principal(r = r, nfactors = nfactors, residuals = residuals, 
##     rotate = rotate, n.obs = n.obs, covar = covar, scores = scores, 
##     missing = missing, impute = impute, oblique.scores = oblique.scores, 
##     method = method, use = use, cor = cor, correct = 0.5, weight = NULL)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                       RC1   RC2   h2   u2 com
## satisfaction_level   0.15  0.87 0.78 0.22 1.1
## last_evaluation      0.77  0.16 0.62 0.38 1.1
## number_project       0.71 -0.35 0.63 0.37 1.5
## average_montly_hours 0.75 -0.10 0.57 0.43 1.0
## time_spend_company   0.28 -0.51 0.34 0.66 1.6
## 
##                        RC1  RC2
## SS loadings           1.76 1.17
## Proportion Var        0.35 0.23
## Cumulative Var        0.35 0.59
## Proportion Explained  0.60 0.40
## Cumulative Proportion 0.60 1.00
## 
## Mean item complexity =  1.2
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.16 
##  with the empirical chi square  7832.32  with prob <  0 
## 
## Fit based upon off diagonal values = 0.5
fa.diagram(pca2)

### Existence of hierarchical latent factor
# omega() fits a hierarchical (general + group) factor model; sl = FALSE
# keeps the hierarchical solution rather than the Schmid-Leiman transform.
# With only 5 items and 2 group factors the model is under-identified,
# hence the warning below -- interpret with caution.
om <- omega(X_norm[ , -c(6)], 2, sl = FALSE)
## 
## Three factors are required for identification -- general factor loadings set to be equal. 
## Proceed with caution. 
## Think about redoing the analysis with alternative values of the 'option' setting.

om
## Omega 
## Call: omegah(m = m, nfactors = nfactors, fm = fm, key = key, flip = flip, 
##     digits = digits, title = title, sl = sl, labels = labels, 
##     plot = plot, n.obs = n.obs, rotate = rotate, Phi = Phi, option = option, 
##     covar = covar)
## Alpha:                 0.51 
## G.6:                   0.5 
## Omega Hierarchical:    0.08 
## Omega H asymptotic:    0.12 
## Omega Total            0.62 
## 
## Schmid Leiman Factor loadings greater than  0.2 
##                          g   F1*   F2*   h2   u2   p2
## satisfaction_level-   0.20       -0.68 0.50 0.50 0.08
## last_evaluation             0.55       0.35 0.65 0.03
## number_project        0.23  0.66       0.50 0.50 0.11
## average_montly_hours        0.58       0.36 0.64 0.07
## time_spend_company          0.23       0.08 0.92 0.13
## 
## With eigenvalues of:
##    g  F1*  F2* 
## 0.14 1.12 0.53 
## 
## general/max  0.13   max/min =   2.11
## mean percent general =  0.08    with sd =  0.04 and cv of  0.46 
## Explained Common Variance of the general factor =  0.08 
## 
## The degrees of freedom are 1  and the fit is  0 
## The number of observations was  14999  with Chi Square =  21.01  with prob <  4.6e-06
## The root mean square of the residuals is  0.01 
## The df corrected root mean square of the residuals is  0.03
## RMSEA index =  0.037  and the 10 % confidence intervals are  0.024 0.051
## BIC =  11.39
## 
## Compare this with the adequacy of just a general factor and no group factors
## The degrees of freedom for just the general factor are 5  and the fit is  0.4 
## The number of observations was  14999  with Chi Square =  5994.02  with prob <  0
## The root mean square of the residuals is  0.21 
## The df corrected root mean square of the residuals is  0.3 
## 
## RMSEA index =  0.283  and the 10 % confidence intervals are  0.277 0.289
## BIC =  5945.94 
## 
## Measures of factor score adequacy             
##                                                   g  F1*   F2*
## Correlation of scores with factors             0.30 0.79  0.70
## Multiple R square of scores with factors       0.09 0.62  0.49
## Minimum correlation of factor score estimates -0.82 0.24 -0.03
## 
##  Total, General and Subset omega for each subset
##                                                  g  F1*  F2*
## Omega total for total scores and subscales    0.62 0.62 0.50
## Omega general for total scores and subscales  0.08 0.05 0.04
## Omega group for total scores and subscales    0.53 0.57 0.46
### Item cluster analysis
# ICLUST groups items hierarchically by their correlations (an alternative
# view on the latent structure found by fa()/pca() above).
ic <- iclust(X_norm[ , -c(6)])

ic
## ICLUST (Item Cluster Analysis)
## Call: iclust(r.mat = X_norm[, -c(6)])
## 
## Purified Alpha:
## [1] 0.51
## 
## G6* reliability:
## [1] 0.76
## 
## Original Beta:
## [1] 0.25
## 
## Cluster size:
## [1] 5
## 
## Item by Cluster Structure matrix:
##                      [,1] 
## satisfaction_level   -0.10
## last_evaluation       0.44
## number_project        0.67
## average_montly_hours  0.55
## time_spend_company    0.30
## 
## With eigenvalues of:
## [1] 1
## 
## Purified scale intercorrelations
##  reliabilities on diagonal
##  correlations corrected for attenuation above diagonal: 
##      [,1]
## [1,] 0.51
## 
## Cluster fit =  0.44   Pattern fit =  0.98  RMSR =  0.07
### Visualization
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.4
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
#### 2-dim PCA: component scores vs. the attrition target
data_PCA <- as.data.frame(cbind(pca2$scores, y))
# CONSISTENCY FIX: map colour with a one-sided formula (~y) so plotly
# resolves it from data_PCA like the x/y/z aesthetics, instead of
# silently depending on the global vector `y`.
fig_pca <- plot_ly(data_PCA, x = ~RC1, y = ~RC2, z = ~y, type = 'scatter3d',
                   size = I(5), color = ~y, colors = c('gray50', 'red'))
fig_pca <- fig_pca %>% layout(
  title = 'Visualization with Numeric Variables and 2 PCs',
  scene = list(
    xaxis = list(title = "PC1"),
    yaxis = list(title = "PC2"),
    zaxis = list(title = "left")
  ))

fig_pca
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
#### 2-dim FA: factor scores vs. the attrition target
data_FA <- as.data.frame(cbind(fa2$scores, y))
fig_fa <- plot_ly(data_FA, x = ~MR1, y = ~MR2, z = ~y, type = 'scatter3d',
                  size = I(5), color = ~y, colors = c('gray50', 'red'))
fig_fa <- fig_fa %>% layout(
  title = 'Visualization with Numeric Variables and 2 Factors',
  scene = list(
    xaxis = list(title = "Factor 1"),
    yaxis = list(title = "Factor 2"),
    zaxis = list(title = "left")
  ))

fig_fa
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## On X1 (the tenure-outlier subset, n = 1282)
fa.parallel(X1_norm[ , -c(6)])

## Parallel analysis suggests that the number of factors =  0  and the number of components =  2
# Parallel analysis finds no common factors here, so only PCA is fitted.
pca1_X1 <- pca(X1_norm[ , -c(6)], 1)
pca1_X1
## Principal Components Analysis
## Call: principal(r = r, nfactors = nfactors, residuals = residuals, 
##     rotate = rotate, n.obs = n.obs, covar = covar, scores = scores, 
##     missing = missing, impute = impute, oblique.scores = oblique.scores, 
##     method = method, use = use, cor = cor, correct = 0.5, weight = NULL)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        PC1     h2   u2 com
## satisfaction_level    0.08 0.0071 0.99   1
## last_evaluation       0.65 0.4229 0.58   1
## number_project        0.60 0.3603 0.64   1
## average_montly_hours  0.68 0.4586 0.54   1
## time_spend_company   -0.47 0.2237 0.78   1
## 
##                 PC1
## SS loadings    1.47
## Proportion Var 0.29
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 component is sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.18 
##  with the empirical chi square  806.46  with prob <  4.6e-172 
## 
## Fit based upon off diagonal values = -0.49
fa.diagram(pca1_X1)

pca2_X1 <- pca(X1_norm[ , -c(6)], 2)
pca2_X1
## Principal Components Analysis
## Call: principal(r = r, nfactors = nfactors, residuals = residuals, 
##     rotate = rotate, n.obs = n.obs, covar = covar, scores = scores, 
##     missing = missing, impute = impute, oblique.scores = oblique.scores, 
##     method = method, use = use, cor = cor, correct = 0.5, weight = NULL)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        RC1   RC2   h2   u2 com
## satisfaction_level    0.53  0.58 0.61 0.39 2.0
## last_evaluation       0.73 -0.10 0.54 0.46 1.0
## number_project        0.24 -0.69 0.53 0.47 1.2
## average_montly_hours  0.71 -0.18 0.53 0.47 1.1
## time_spend_company   -0.10  0.66 0.45 0.55 1.0
## 
##                        RC1  RC2
## SS loadings           1.38 1.29
## Proportion Var        0.28 0.26
## Cumulative Var        0.28 0.53
## Proportion Explained  0.52 0.48
## Cumulative Proportion 0.52 1.00
## 
## Mean item complexity =  1.3
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.19 
##  with the empirical chi square  919.02  with prob <  7.2e-202 
## 
## Fit based upon off diagonal values = -0.7
fa.diagram(pca2_X1)

### Existence of hierarchical latent factor
# Same under-identified 2-group-factor omega model as on the full data;
# the identification warning applies here as well.
om_X1 <- omega(X1_norm[ , -c(6)], 2, sl = FALSE)
## 
## Three factors are required for identification -- general factor loadings set to be equal. 
## Proceed with caution. 
## Think about redoing the analysis with alternative values of the 'option' setting.

om_X1
## Omega 
## Call: omegah(m = m, nfactors = nfactors, fm = fm, key = key, flip = flip, 
##     digits = digits, title = title, sl = sl, labels = labels, 
##     plot = plot, n.obs = n.obs, rotate = rotate, Phi = Phi, option = option, 
##     covar = covar)
## Alpha:                 0.33 
## G.6:                   0.32 
## Omega Hierarchical:    0.13 
## Omega H asymptotic:    0.3 
## Omega Total            0.43 
## 
## Schmid Leiman Factor loadings greater than  0.2 
##                         g   F1*  F2*   h2   u2   p2
## satisfaction_level-       -0.30 0.33 0.20 0.80 0.00
## last_evaluation      0.23  0.46      0.27 0.73 0.20
## number_project       0.27       0.44 0.28 0.72 0.26
## average_montly_hours 0.24  0.43      0.25 0.75 0.23
## time_spend_company-             0.31 0.13 0.87 0.23
## 
## With eigenvalues of:
##    g  F1*  F2* 
## 0.22 0.51 0.41 
## 
## general/max  0.42   max/min =   1.26
## mean percent general =  0.18    with sd =  0.11 and cv of  0.57 
## Explained Common Variance of the general factor =  0.19 
## 
## The degrees of freedom are 1  and the fit is  0 
## The number of observations was  1282  with Chi Square =  2.17  with prob <  0.14
## The root mean square of the residuals is  0.01 
## The df corrected root mean square of the residuals is  0.03
## RMSEA index =  0.03  and the 10 % confidence intervals are  0 0.087
## BIC =  -4.99
## 
## Compare this with the adequacy of just a general factor and no group factors
## The degrees of freedom for just the general factor are 5  and the fit is  0.12 
## The number of observations was  1282  with Chi Square =  155.42  with prob <  9.4e-32
## The root mean square of the residuals is  0.11 
## The df corrected root mean square of the residuals is  0.16 
## 
## RMSEA index =  0.153  and the 10 % confidence intervals are  0.133 0.174
## BIC =  119.64 
## 
## Measures of factor score adequacy             
##                                                   g   F1*   F2*
## Correlation of scores with factors             0.38  0.61  0.57
## Multiple R square of scores with factors       0.15  0.37  0.32
## Minimum correlation of factor score estimates -0.70 -0.26 -0.36
## 
##  Total, General and Subset omega for each subset
##                                                  g  F1*  F2*
## Omega total for total scores and subscales    0.43 0.41 0.37
## Omega general for total scores and subscales  0.13 0.09 0.06
## Omega group for total scores and subscales    0.29 0.32 0.31
### Item cluster analysis
ic_X1 <- iclust(X1_norm[ , -c(6)])

ic_X1
## ICLUST (Item Cluster Analysis)
## Call: iclust(r.mat = X1_norm[, -c(6)])
## 
## Purified Alpha:
## [1] 0.33
## 
## G6* reliability:
## [1] 0.12
## 
## Original Beta:
## [1] 0.16
## 
## Cluster size:
## [1] 5
## 
## Item by Cluster Structure matrix:
##                      [,1] 
## satisfaction_level    0.01
## last_evaluation      -0.30
## number_project       -0.47
## average_montly_hours -0.35
## time_spend_company    0.34
## 
## With eigenvalues of:
## [1] 0.55
## 
## Purified scale intercorrelations
##  reliabilities on diagonal
##  correlations corrected for attenuation above diagonal: 
##      [,1]
## [1,] 0.33
## 
## Cluster fit =  0.24   Pattern fit =  0.97  RMSR =  0.08
### Visualization
#### 2-dim PCA
# Outcome values for the rows selected by out_ind (presumably the outlier
# subset that defines X1 -- confirm against the earlier split).
y1 <- y[out_ind]
data_PCA_X1 <- as.data.frame(cbind(pca2_X1$scores, y1))

# 3-D scatter of the two rotated components (RC1, RC2) against the outcome,
# colored by whether the employee left.
fig_pca_X1 <- data_PCA_X1 %>%
  plot_ly(x = ~RC1, y = ~RC2, z = ~y1,
          type = 'scatter3d', size = I(8),
          color = y1, colors = c('gray50', 'red')) %>%
  layout(title = 'Visualization with Numeric Variables and 2 PCs (X1)',
         scene = list(xaxis = list(title = "PC1"),
                      yaxis = list(title = "PC2"),
                      zaxis = list(title = "left")))

fig_pca_X1
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## On X2
# Parallel analysis (scree + simulated data) to suggest the number of
# factors/components for the X2 subset; column 6 excluded as before.
fa.parallel(X2_norm[, -6])

## Parallel analysis suggests that the number of factors =  2  and the number of components =  2
# Single-factor exploratory factor analysis (psych default method: minres).
fa1_X2 <- fa(X2_norm[, -6], nfactors = 1)
print(fa1_X2)
## Factor Analysis using method =  minres
## Call: fa(r = X2_norm[, -c(6)], nfactors = 1)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        MR1    h2   u2 com
## satisfaction_level   -0.12 0.015 0.99   1
## last_evaluation       0.50 0.253 0.75   1
## number_project        0.74 0.554 0.45   1
## average_montly_hours  0.60 0.357 0.64   1
## time_spend_company    0.40 0.158 0.84   1
## 
##                 MR1
## SS loadings    1.34
## Proportion Var 0.27
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  10  and the objective function was  0.61 with Chi Square of  8311.15
## The degrees of freedom for the model are 5  and the objective function was  0.09 
## 
## The root mean square of the residuals (RMSR) is  0.08 
## The df corrected root mean square of the residuals is  0.11 
## 
## The harmonic number of observations is  13717 with the empirical chi square  1687.77  with prob <  0 
## The total number of observations was  13717  with Likelihood Chi Square =  1169.63  with prob <  1.1e-250 
## 
## Tucker Lewis Index of factoring reliability =  0.719
## RMSEA index =  0.13  and the 90 % confidence intervals are  0.124 0.137
## BIC =  1122
## Fit based upon off diagonal values = 0.91
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.84
## Multiple R square of scores with factors          0.70
## Minimum correlation of possible factor scores     0.40
# Path diagram for the one-factor solution.
fa.diagram(fa1_X2)

# Two-factor EFA, matching the parallel-analysis suggestion above.
fa2_X2 <- fa(X2_norm[, -6], nfactors = 2)
print(fa2_X2)
## Factor Analysis using method =  minres
## Call: fa(r = X2_norm[, -c(6)], nfactors = 2)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                       MR1   MR2   h2   u2 com
## satisfaction_level   0.00  0.93 0.86 0.14 1.0
## last_evaluation      0.57  0.17 0.33 0.67 1.2
## number_project       0.70 -0.09 0.52 0.48 1.0
## average_montly_hours 0.61  0.03 0.37 0.63 1.0
## time_spend_company   0.37 -0.19 0.18 0.82 1.5
## 
##                        MR1  MR2
## SS loadings           1.33 0.93
## Proportion Var        0.27 0.19
## Cumulative Var        0.27 0.45
## Proportion Explained  0.59 0.41
## Cumulative Proportion 0.59 1.00
## 
##  With factor correlations of 
##      MR1  MR2
## MR1  1.0 -0.1
## MR2 -0.1  1.0
## 
## Mean item complexity =  1.1
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  10  and the objective function was  0.61 with Chi Square of  8311.15
## The degrees of freedom for the model are 1  and the objective function was  0 
## 
## The root mean square of the residuals (RMSR) is  0.01 
## The df corrected root mean square of the residuals is  0.04 
## 
## The harmonic number of observations is  13717 with the empirical chi square  35.33  with prob <  2.8e-09 
## The total number of observations was  13717  with Likelihood Chi Square =  41.16  with prob <  1.4e-10 
## 
## Tucker Lewis Index of factoring reliability =  0.952
## RMSEA index =  0.054  and the 90 % confidence intervals are  0.041 0.069
## BIC =  31.63
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    MR1  MR2
## Correlation of (regression) scores with factors   0.83 0.93
## Multiple R square of scores with factors          0.70 0.86
## Minimum correlation of possible factor scores     0.39 0.72
# Path diagram for the two-factor solution.
fa.diagram(fa2_X2)

# One-component PCA (psych::pca, a wrapper around principal) for comparison
# with the one-factor EFA.
pca1_X2 <- pca(X2_norm[, -6], nfactors = 1)
print(pca1_X2)
## Principal Components Analysis
## Call: principal(r = r, nfactors = nfactors, residuals = residuals, 
##     rotate = rotate, n.obs = n.obs, covar = covar, scores = scores, 
##     missing = missing, impute = impute, oblique.scores = oblique.scores, 
##     method = method, use = use, cor = cor, correct = 0.5, weight = NULL)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        PC1   h2   u2 com
## satisfaction_level   -0.20 0.04 0.96   1
## last_evaluation       0.66 0.43 0.57   1
## number_project        0.79 0.63 0.37   1
## average_montly_hours  0.73 0.53 0.47   1
## time_spend_company    0.58 0.33 0.67   1
## 
##                 PC1
## SS loadings    1.96
## Proportion Var 0.39
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 component is sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.16 
##  with the empirical chi square  6683.92  with prob <  0 
## 
## Fit based upon off diagonal values = 0.65
# Diagram for the one-component solution.
fa.diagram(pca1_X2)

# Two-component PCA; scores are reused later for the 3-D visualization.
pca2_X2 <- pca(X2_norm[, -6], nfactors = 2)
print(pca2_X2)
## Principal Components Analysis
## Call: principal(r = r, nfactors = nfactors, residuals = residuals, 
##     rotate = rotate, n.obs = n.obs, covar = covar, scores = scores, 
##     missing = missing, impute = impute, oblique.scores = oblique.scores, 
##     method = method, use = use, cor = cor, correct = 0.5, weight = NULL)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                       RC1   RC2   h2   u2 com
## satisfaction_level   0.14  0.88 0.79 0.21 1.1
## last_evaluation      0.78  0.16 0.63 0.37 1.1
## number_project       0.72 -0.32 0.63 0.37 1.4
## average_montly_hours 0.75 -0.09 0.57 0.43 1.0
## time_spend_company   0.38 -0.60 0.51 0.49 1.7
## 
##                        RC1  RC2
## SS loadings           1.85 1.27
## Proportion Var        0.37 0.25
## Cumulative Var        0.37 0.62
## Proportion Explained  0.59 0.41
## Cumulative Proportion 0.59 1.00
## 
## Mean item complexity =  1.2
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.15 
##  with the empirical chi square  6248.54  with prob <  0 
## 
## Fit based upon off diagonal values = 0.67
# Diagram for the two-component solution.
fa.diagram(pca2_X2)

### Existence of hierarchical latent factor
# Hierarchical omega with 2 group factors. sl = FALSE requests the
# hierarchical (higher-order) representation rather than the Schmid-Leiman
# transformation in the diagram.
om_X2 <- omega(X2_norm[, -6], nfactors = 2, sl = FALSE)
## 
## Three factors are required for identification -- general factor loadings set to be equal. 
## Proceed with caution. 
## Think about redoing the analysis with alternative values of the 'option' setting.

print(om_X2)
## Omega 
## Call: omegah(m = m, nfactors = nfactors, fm = fm, key = key, flip = flip, 
##     digits = digits, title = title, sl = sl, labels = labels, 
##     plot = plot, n.obs = n.obs, rotate = rotate, Phi = Phi, option = option, 
##     covar = covar)
## Alpha:                 0.58 
## G.6:                   0.56 
## Omega Hierarchical:    0.12 
## Omega H asymptotic:    0.16 
## Omega Total            0.7 
## 
## Schmid Leiman Factor loadings greater than  0.2 
##                          g   F1*   F2*   h2   u2   p2
## satisfaction_level-   0.30       -0.88 0.86 0.14 0.10
## last_evaluation             0.54       0.33 0.67 0.05
## number_project        0.25  0.67       0.52 0.48 0.12
## average_montly_hours        0.58       0.37 0.63 0.09
## time_spend_company          0.35       0.18 0.82 0.17
## 
## With eigenvalues of:
##    g  F1*  F2* 
## 0.23 1.19 0.84 
## 
## general/max  0.19   max/min =   1.43
## mean percent general =  0.11    with sd =  0.04 and cv of  0.41 
## Explained Common Variance of the general factor =  0.1 
## 
## The degrees of freedom are 1  and the fit is  0 
## The number of observations was  13717  with Chi Square =  41.16  with prob <  1.4e-10
## The root mean square of the residuals is  0.01 
## The df corrected root mean square of the residuals is  0.04
## RMSEA index =  0.054  and the 10 % confidence intervals are  0.041 0.069
## BIC =  31.63
## 
## Compare this with the adequacy of just a general factor and no group factors
## The degrees of freedom for just the general factor are 5  and the fit is  0.47 
## The number of observations was  13717  with Chi Square =  6504.72  with prob <  0
## The root mean square of the residuals is  0.23 
## The df corrected root mean square of the residuals is  0.33 
## 
## RMSEA index =  0.308  and the 10 % confidence intervals are  0.302 0.314
## BIC =  6457.08 
## 
## Measures of factor score adequacy             
##                                                   g  F1*  F2*
## Correlation of scores with factors             0.38 0.79 0.88
## Multiple R square of scores with factors       0.15 0.63 0.78
## Minimum correlation of factor score estimates -0.71 0.25 0.55
## 
##  Total, General and Subset omega for each subset
##                                                  g  F1*  F2*
## Omega total for total scores and subscales    0.70 0.66 0.86
## Omega general for total scores and subscales  0.12 0.07 0.09
## Omega group for total scores and subscales    0.57 0.59 0.77
### Item cluster analysis
# Hierarchical item clustering on the X2 numeric variables (column 6 dropped,
# consistent with the other X2 analyses).
ic_X2 <- iclust(X2_norm[, -6])

print(ic_X2)
## ICLUST (Item Cluster Analysis)
## Call: iclust(r.mat = X2_norm[, -c(6)])
## 
## Purified Alpha:
## [1] 0.58
## 
## G6* reliability:
## [1] 0.78
## 
## Original Beta:
## [1] 0.35
## 
## Cluster size:
## [1] 5
## 
## Item by Cluster Structure matrix:
##                      [,1] 
## satisfaction_level   -0.16
## last_evaluation       0.44
## number_project        0.67
## average_montly_hours  0.55
## time_spend_company    0.46
## 
## With eigenvalues of:
## [1] 1.2
## 
## Purified scale intercorrelations
##  reliabilities on diagonal
##  correlations corrected for attenuation above diagonal: 
##      [,1]
## [1,] 0.58
## 
## Cluster fit =  0.51   Pattern fit =  0.98  RMSR =  0.09
### Visualization
#### 2-dim PCA
# Outcome for the complement of out_ind (the non-outlier rows making up X2 --
# confirm against the earlier split).
y2 <- y[-out_ind]
data_PCA_X2 <- as.data.frame(cbind(pca2_X2$scores, y2))

# 3-D scatter of the two rotated components against the outcome.
fig_pca_X2 <- data_PCA_X2 %>%
  plot_ly(x = ~RC1, y = ~RC2, z = ~y2, type = 'scatter3d',
          size = I(5), color = y2, colors = c('gray50', 'red')) %>%
  layout(title = 'Visualization with Numeric Variables and 2 PCs (X2)',
         scene = list(xaxis = list(title = "PC1"),
                      yaxis = list(title = "PC2"),
                      zaxis = list(title = "left")))

fig_pca_X2
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
#### 2-dim FA
# Same visualization idea, but using the two EFA factor scores (MR1, MR2)
# from fa2_X2 instead of the PCA components.
data_FA_X2 <- as.data.frame(cbind(fa2_X2$scores, y2))

fig_fa_X2 <- data_FA_X2 %>%
  plot_ly(x = ~MR1, y = ~MR2, z = ~y2, type = 'scatter3d',
          size = I(5), color = y2, colors = c('gray50', 'red')) %>%
  layout(title = 'Visualization with Numeric Variables and 2 Factors (X2)',
         scene = list(xaxis = list(title = "Factor 1"),
                      yaxis = list(title = "Factor 2"),
                      zaxis = list(title = "left")))

fig_fa_X2
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
# Confirmatory factor analysis
library(lavaan)
## Warning: package 'lavaan' was built under R version 4.0.4
## This is lavaan 0.6-8
## lavaan is FREE software! Please report any bugs.
## 
## Attaching package: 'lavaan'
## The following object is masked from 'package:psych':
## 
##     cor2cov
## One-factor model
# Confirmatory factor analysis: a single latent factor loading on the three
# workload-related indicators. Fit on the full (normalized) data, column 6
# excluded as elsewhere.
m1 <- 'f1 =~  number_project + average_montly_hours + last_evaluation'
CFA1 <- cfa(m1, data = X_norm[, -6])
summary(CFA1, fit.measures = TRUE, standardized = TRUE)
## lavaan 0.6-8 ended normally after 21 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         6
##                                                       
##   Number of observations                         14999
##                                                       
## Model Test User Model:
##                                                       
##   Test statistic                                 0.000
##   Degrees of freedom                                 0
## 
## Model Test Baseline Model:
## 
##   Test statistic                              5619.829
##   Degrees of freedom                                 3
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    1.000
##   Tucker-Lewis Index (TLI)                       1.000
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -61036.562
##   Loglikelihood unrestricted model (H1)     -61036.562
##                                                       
##   Akaike (AIC)                              122085.125
##   Bayesian (BIC)                            122130.819
##   Sample-size adjusted Bayesian (BIC)       122111.752
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.000
##   90 Percent confidence interval - lower         0.000
##   90 Percent confidence interval - upper         0.000
##   P-value RMSEA <= 0.05                             NA
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.000
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   f1 =~                                                                 
##     number_project    1.000                               0.655    0.655
##     avrg_mntly_hrs    0.973    0.025   39.067    0.000    0.637    0.637
##     last_evaluatin    0.814    0.020   39.777    0.000    0.533    0.533
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .number_project    0.571    0.012   46.062    0.000    0.571    0.571
##    .avrg_mntly_hrs    0.594    0.012   49.236    0.000    0.594    0.594
##    .last_evaluatin    0.715    0.011   66.229    0.000    0.715    0.716
##     f1                0.429    0.014   30.329    0.000    1.000    1.000
## Two-factor model
# Adds f2 with time_spend_company as its only indicator; the printed output
# shows its residual variance at 0, i.e. f2 is effectively the observed
# variable itself (single-indicator factor).
m2 <- 'f1 =~ number_project + average_montly_hours + last_evaluation
        f2 =~ time_spend_company'
CFA2 <- cfa(m2, data = X_norm[, -6])
summary(CFA2, fit.measures = TRUE, standardized = TRUE)
## lavaan 0.6-8 ended normally after 22 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         8
##                                                       
##   Number of observations                         14999
##                                                       
## Model Test User Model:
##                                                       
##   Test statistic                                48.919
##   Degrees of freedom                                 2
##   P-value (Chi-square)                           0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                              6302.782
##   Degrees of freedom                                 6
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.993
##   Tucker-Lewis Index (TLI)                       0.978
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -82001.705
##   Loglikelihood unrestricted model (H1)     -81977.245
##                                                       
##   Akaike (AIC)                              164019.409
##   Bayesian (BIC)                            164080.335
##   Sample-size adjusted Bayesian (BIC)       164054.912
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.040
##   90 Percent confidence interval - lower         0.030
##   90 Percent confidence interval - upper         0.050
##   P-value RMSEA <= 0.05                          0.958
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.013
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   f1 =~                                                                 
##     number_project    1.000                               0.675    0.675
##     avrg_mntly_hrs    0.917    0.022   41.080    0.000    0.619    0.619
##     last_evaluatin    0.786    0.019   40.614    0.000    0.530    0.530
##   f2 =~                                                                 
##     tim_spnd_cmpny    1.000                               1.000    1.000
## 
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   f1 ~~                                                                 
##     f2                0.171    0.007   23.575    0.000    0.254    0.254
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .number_project    0.544    0.012   44.679    0.000    0.544    0.544
##    .avrg_mntly_hrs    0.617    0.011   54.339    0.000    0.617    0.617
##    .last_evaluatin    0.719    0.011   67.455    0.000    0.719    0.719
##    .tim_spnd_cmpny    0.000                               0.000    0.000
##     f1                0.456    0.014   32.005    0.000    1.000    1.000
##     f2                1.000    0.012   86.600    0.000    1.000    1.000
## Visualization of CFA
library(tidySEM)
## Warning: package 'tidySEM' was built under R version 4.0.4
## Registered S3 methods overwritten by 'tidySEM':
##   method              from           
##   print.mplus.model   MplusAutomation
##   print.mplusObject   MplusAutomation
##   summary.mplus.model MplusAutomation
## 
## Attaching package: 'tidySEM'
## The following object is masked from 'package:plotly':
## 
##     add_paths
# Path diagrams (tidySEM) for the one- and two-factor CFA models.
graph_sem(model = CFA1)

graph_sem(model = CFA2)

save.image("D:/Yaxin/HKBU BM/Courses/Sem 2/ECON7860 Big Data Analytics for Business (S11)/Group Project/HR Analytics/working/EDA.RData")